{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# DeepLake Vector Store"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/adilkhansarsen/Documents/work/LlamaIndex/llama_index/GPTIndex/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import textwrap\n",
    "\n",
    "from llama_index import VectorStoreIndex, SimpleDirectoryReader, Document\n",
    "from llama_index.vector_stores import DeepLakeVectorStore\n",
    "\n",
    "os.environ[\"OPENAI_API_KEY\"] = \"sk-********************************\"\n",
    "os.environ[\"ACTIVELOOP_TOKEN\"] = \"********************************\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install deeplake"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "if you don't export token in your environment alternativalay you can use deeplake CLI to loging to deeplake"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !activeloop login -t <TOKEN>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Document ID: 14935662-4884-4c57-ac2e-fa62da019665 Document Hash: 77ae91ab542f3abb308c4d7c77c9bc4c9ad0ccd63144802b7cbe7e1bb3a4094e\n"
     ]
    }
   ],
   "source": [
    "# load documents\n",
    "documents = SimpleDirectoryReader(\"../paul_graham_essay/data\").load_data()\n",
    "print(\"Document ID:\", documents[0].doc_id, \"Document Hash:\", documents[0].doc_hash)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Your Deep Lake dataset has been successfully created!\n",
      "The dataset is private so make sure you are logged in!\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "|"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/adilkhan/paul_graham_essay\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "hub://adilkhan/paul_graham_essay loaded successfully.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Evaluating ingest: 100%|██████████| 1/1 [00:21<00:00\n",
      "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 17617 tokens\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset(path='hub://adilkhan/paul_graham_essay', tensors=['embedding', 'ids', 'metadata', 'text'])\n",
      "\n",
      "  tensor     htype     shape     dtype  compression\n",
      "  -------   -------   -------   -------  ------- \n",
      " embedding  generic  (6, 1536)   None     None   \n",
      "    ids      text     (6, 1)      str     None   \n",
      " metadata    json     (6, 1)      str     None   \n",
      "   text      text     (6, 1)      str     None   \n"
     ]
    }
   ],
   "source": [
    "# dataset_path = \"hub://adilkhan/paul_graham_essay\" # if we comment this out and don't pass the path then GPTDeepLakeIndex will create dataset in memory\n",
    "from llama_index.storage.storage_context import StorageContext\n",
    "\n",
    "\n",
    "dataset_path = \"paul_graham_essay\"\n",
    "\n",
    "# Create an index over the documnts\n",
    "vector_store = DeepLakeVectorStore(dataset_path=dataset_path, overwrite=True)\n",
    "storage_context = StorageContext.from_defaults(vector_store=vector_store)\n",
    "index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "if we decide to not pass the path then GPTDeepLakeIndex will create dataset locally called llama_index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "llama_index loaded successfully.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Evaluating ingest: 100%|██████████| 1/1 [00:04<00:00\n",
      "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 17617 tokens\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset(path='llama_index', tensors=['embedding', 'ids', 'metadata', 'text'])\n",
      "\n",
      "  tensor     htype     shape     dtype  compression\n",
      "  -------   -------   -------   -------  ------- \n",
      " embedding  generic  (6, 1536)   None     None   \n",
      "    ids      text     (6, 1)      str     None   \n",
      " metadata    json     (6, 1)      str     None   \n",
      "   text      text     (6, 1)      str     None   \n"
     ]
    }
   ],
   "source": [
    "# Create an index over the documnts\n",
    "# vector_store = DeepLakeVectorStore(overwrite=True)\n",
    "# storage_context = StorageContext.from_defaults(vector_store=vector_store)\n",
    "# index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 4028 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 6 tokens\n"
     ]
    }
   ],
   "source": [
    "query_engine = index.as_query_engine()\n",
    "response = query_engine.query(\n",
    "    \"What did the author learn?\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  The author learned that working on things that are not prestigious can be a good thing, as it can\n",
      "lead to discovering something real and avoiding the wrong track. The author also learned that\n",
      "ignorance can be beneficial, as it can lead to discovering something new and unexpected. The author\n",
      "also learned the importance of working hard, even at the parts of the job they don't like, in order\n",
      "to set an example for others. The author also learned the value of unsolicited advice, as it can be\n",
      "beneficial in unexpected ways, such as when Robert Morris suggested that the author should make sure\n",
      "Y Combinator wasn't the last cool thing they did.\n"
     ]
    }
   ],
   "source": [
    "print(textwrap.fill(str(response), 100))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 4072 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 9 tokens\n"
     ]
    }
   ],
   "source": [
    "response = query_engine.query(\"What was a hard moment for the author?\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " A hard moment for the author was when he was dealing with urgent problems during YC and about 60%\n",
      "of them had to do with Hacker News, a news aggregator he had created. He was overwhelmed by the\n",
      "amount of work he had to do to keep Hacker News running, and it was taking away from his ability to\n",
      "focus on other projects. He was also haunted by the idea that his own work ethic set the upper bound\n",
      "for how hard everyone else worked, so he felt he had to work very hard. He was also dealing with\n",
      "disputes between cofounders, figuring out when people were lying to them, and fighting with people\n",
      "who maltreated the startups. On top of this, he was given unsolicited advice from Robert Morris to\n",
      "make sure Y Combinator wasn't the last cool thing he did, which made him consider quitting.\n"
     ]
    }
   ],
   "source": [
    "print(textwrap.fill(str(response), 100))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 4072 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 9 tokens\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " A hard moment for the author was when he was dealing with urgent problems during YC and about 60%\n",
      "of them had to do with Hacker News, a news aggregator he had created. He was overwhelmed by the\n",
      "amount of work he had to do to keep Hacker News running, and it was taking away from his ability to\n",
      "focus on other projects. He was also haunted by the idea that his own work ethic set the upper bound\n",
      "for how hard everyone else worked, so he felt he had to work very hard. He was also dealing with\n",
      "disputes between cofounders, figuring out when people were lying to them, and fighting with people\n",
      "who maltreated the startups. On top of this, he was given unsolicited advice from Robert Morris to\n",
      "make sure Y Combinator wasn't the last cool thing he did, which made him consider quitting.\n"
     ]
    }
   ],
   "source": [
    "query_engine = index.as_query_engine()\n",
    "response = query_engine.query(\"What was a hard moment for the author?\")\n",
    "print(textwrap.fill(str(response), 100))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Deleting items from the database"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\\"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/adilkhan/paul_graham_essay\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\\"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "hub://adilkhan/paul_graham_essay loaded successfully.\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " \r"
     ]
    }
   ],
   "source": [
    "import deeplake as dp\n",
    "\n",
    "\n",
    "ds = dp.load(\"paul_graham_essay\")\n",
    "\n",
    "idx = ds.ids[0].numpy().tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 6/6 [00:00<00:00, 4501.13it/s]\n",
      " \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset(path='hub://adilkhan/paul_graham_essay', tensors=['embedding', 'ids', 'metadata', 'text'])\n",
      "\n",
      "  tensor     htype     shape     dtype  compression\n",
      "  -------   -------   -------   -------  ------- \n",
      " embedding  generic  (5, 1536)   None     None   \n",
      "    ids      text     (5, 1)      str     None   \n",
      " metadata    json     (5, 1)      str     None   \n",
      "   text      text     (5, 1)      str     None   \n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": []
    }
   ],
   "source": [
    "index.delete(idx[0])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.10"
  },
  "vscode": {
   "interpreter": {
    "hash": "6e44765f40d39e4c6e3d7a9b35e5b42b8711c1c0fb3c237b84fa62e4b3e35e04"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
